Gender bias in speakers and career position
Data
EcoEncontros Seminar talks
Talks from EcoEncontros Seminar series at the Graduate Program of Ecology in the University of SĂŁo Paulo (PPGE-USP), Brazil
See file metadata.txt, in folder data for more description and detail of the dataset.
data <- read.table("data/presentations_PPGE_2008-2019.csv", sep=",",
header=T, as.is=T)
data$date <- dmy(data$date)
data$year <- year(data$date)
#skimr::skim(data)Excluding special events as round tables and discussions not related to a project or study presented by someone.
IDs <- c(154, 250, 211, 289)
data <- data %>% filter(!id %in% IDs)For this specific analysis, excluding speakers that are not in academia (âothersâ), and keeping undergraduate students, MD and PhD in the group student. postdoc, professor or researcher*.
*Researchers are included in the professor categorical position (column position_cat) because all of them come from research institutions.
data <- data %>% filter(position_cat != "others")
data$position_cat <- fct_relevel(data$position_cat, "student",
"postdoc","professor")Excluding seminars with more than one speaker
events <- data %>% count(id) %>% filter(n>1)
data2 <- data %>% filter(!id %in% events$id,
!is.na(audience_n))dim(data)## [1] 330 30
Data description
Audience by gender and academic position
ggplot(data, aes(x=position_cat, y=audience_n, fill=gender)) +
scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
geom_boxplot() #geom_violin(position = position_dodge(0.8)) +
#geom_jitter(position=position_jitterdodge(0.2),shape=21)library(ggbeeswarm)
# outra opção
ggplot(data, aes(x=position_cat, y=audience_n, fill=gender)) +
scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
scale_color_manual(values = c("#b2abd2", "#fdb863"))+
geom_violin(col="black") +
geom_quasirandom(dodge.width = 0.9, shape=21)+
stat_summary(fun.y=median, aes(ymin=..y.., ymax=..y..),geom='errorbar',
width=0.8, size=0.8, position = position_dodge(width = 0.9))+
xlab("") + ylab("Audience (N)")Variation in time
ggplot(data, aes(x=date, y=audience_n, fill=gender)) +
facet_wrap(~position_cat, ncol=1)+
scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
scale_color_manual(values = c("#b2abd2", "#fdb863"))+
geom_quasirandom(dodge.width = 0.9, shape=21)+
geom_smooth()+
xlab("") + ylab("Audience (N)")Looking for possible biases for speakers from inside and outside PPGE.
data$ppge <- ifelse(data$origin == "IB", "inside", "outside")
table(data$gender,data$ppge)##
## inside outside
## F 88 53
## M 91 98
ggplot(data, aes(x=ppge, y=audience_n, fill=gender)) +
scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
scale_color_manual(values = c("#b2abd2", "#fdb863"))+
geom_violin(col="black") +
geom_quasirandom(dodge.width = 0.9, shape=21)+
stat_summary(fun.y=median, aes(ymin=..y.., ymax=..y..),geom='errorbar',
width=0.8, size=0.8, position = position_dodge(width = 0.9))+
xlab("PPGE") + ylab("Audience (N)")Looking for possible biases for speakers from Brazil and abroad.
data$brazilian <- ifelse(data$country == "Brasil", "yes", "no")
table(data$gender,data$brazilian)##
## no yes
## F 25 116
## M 51 138
ggplot(data, aes(x=brazilian, y=audience_n, fill=gender)) +
scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
scale_color_manual(values = c("#b2abd2", "#fdb863"))+
geom_violin(col="black") +
geom_quasirandom(dodge.width = 0.9, shape=21)+
stat_summary(fun.y=median, aes(ymin=..y.., ymax=..y..),geom='errorbar',
width=0.8, size=0.8, position = position_dodge(width = 0.9))+
xlab("Brazilian") + ylab("Audience (N)")Modeling
Negative binomial
mg0 <- glm.nb(audience_n~ 1, data=data)
mg1 <- glm.nb(audience_n~ gender, data=data)
mg2 <- glm.nb(audience_n~ position_cat, data=data)
mg3 <- glm.nb(audience_n~ year, data=data)
mg4 <- glm.nb(audience_n~ gender + position_cat, data=data)
mg5 <- glm.nb(audience_n~ gender + year, data=data)
mg6 <- glm.nb(audience_n~ year + position_cat, data=data)
mg7 <- glm.nb(audience_n~ gender*position_cat, data=data)
mg8 <- glm.nb(audience_n~ gender*year, data=data)
mg9 <- glm.nb(audience_n~ year*position_cat, data=data)
mg10 <- glm.nb(audience_n~ gender*position_cat*year, data=data)
AICtab(mg2, mg0,mg1, mg3, mg4,mg5,mg6,mg7,mg8,mg9,mg10)## dAIC df
## mg7 0.0 7
## mg4 1.7 5
## mg6 2.2 5
## mg2 4.1 4
## mg10 4.4 13
## mg9 5.2 7
## mg5 30.3 4
## mg1 30.9 3
## mg8 31.9 5
## mg3 42.5 3
## mg0 42.7 2
Residual diagnostic
hnp::hnp(mg7)## Negative binomial model (using MASS package)
plot(simulateResiduals(mg7))hnp::hnp(mg4)## Negative binomial model (using MASS package)
plot(simulateResiduals(mg4))Models result
summary(mg7)##
## Call:
## glm.nb(formula = audience_n ~ gender * position_cat, data = data,
## init.theta = 6.535878002, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9224 -0.7849 -0.1143 0.5305 3.6658
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.93339 0.04760 61.621 <2e-16 ***
## genderM 0.05745 0.06888 0.834 0.404
## position_catpostdoc 0.01944 0.10234 0.190 0.849
## position_catprofessor 0.13000 0.10127 1.284 0.199
## genderM:position_catpostdoc -0.06749 0.13922 -0.485 0.628
## genderM:position_catprofessor 0.26468 0.12339 2.145 0.032 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(6.5359) family taken to be 1)
##
## Null deviance: 392.02 on 329 degrees of freedom
## Residual deviance: 334.86 on 324 degrees of freedom
## AIC: 2401.8
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 6.536
## Std. Err.: 0.657
##
## 2 x log-likelihood: -2387.781
performance::r2(mg7)## # R2 for Generalized Linear Regression
## Nagelkerke's R2: 0.229
myg7 <- ggpredict(mg7, terms=c("position_cat","gender"))
plot(myg7) +
scale_color_manual(values = c("#b2abd2", "#fdb863"))summary(mg4)##
## Call:
## glm.nb(formula = audience_n ~ gender + position_cat, data = data,
## init.theta = 6.390867873, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8952 -0.7755 -0.1604 0.4810 3.7995
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.90807 0.04276 68.010 < 2e-16 ***
## genderM 0.11084 0.05196 2.133 0.0329 *
## position_catpostdoc -0.02214 0.06993 -0.317 0.7516
## position_catprofessor 0.31862 0.05776 5.517 3.45e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(6.3909) family taken to be 1)
##
## Null deviance: 385.44 on 329 degrees of freedom
## Residual deviance: 334.94 on 326 degrees of freedom
## AIC: 2403.5
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 6.391
## Std. Err.: 0.638
##
## 2 x log-likelihood: -2393.505
performance::r2(mg4)## # R2 for Generalized Linear Regression
## Nagelkerke's R2: 0.206
myg4 <- ggpredict(mg4, terms=c("position_cat","gender"))
plot(myg4) +
scale_color_manual(values = c("#b2abd2", "#fdb863"))Only professors - productivity metrics
Investigating if differences in productivity between male and female professors and researches are related to the audience.
Measured productivity publication metrics from Google Scholar for professors and researchers.
Creating productivity index using PCA 1st axis from metrics.
PCA productivity metrics
dp <- data %>% filter(!is.na(data$total_citation_n),
!is.na(data$nature_index_count))
table(dp$gender)##
## F M
## 20 69
Productivity publication metrics
pca1 <- PCA(dp[, c(21:28)], graph=F)
p1 <- fviz_pca_biplot(pca1, col.ind = dp$gender, addEllipses=TRUE,
col.ind.sub="none", geom="point",
repel = TRUE) +
scale_color_manual(name="gender",values = c("#b2abd2", "#fdb863"))+
scale_fill_manual(name="gender",values = c("#b2abd2", "#fdb863"))
p1Extracting PCA 2 first axes
dp$pc1 <- pca1$ind$coord[,1]
dp$pc2 <- pca1$ind$coord[,2]Modeling
m0 <- glm.nb(audience_n ~ 1, data=dp)
m1 <- glm.nb(audience_n ~ gender, data=dp)
m2 <- glm.nb(audience_n ~ pc1 + pc2, data=dp)
m3 <- glm.nb(audience_n ~ gender + pc1 + pc2, data=dp)
m4 <- glm.nb(audience_n ~ gender*pc1 + gender*pc2, data=dp)
AICtab(m0,m1,m2,m3,m4, base=T, weights=T)## AIC dAIC df weight
## m3 709.3 0.0 5 0.556
## m4 711.5 2.3 7 0.179
## m2 711.9 2.7 4 0.148
## m1 712.6 3.4 3 0.103
## m0 716.6 7.3 2 0.014
Residual diagnostic
Best model
hnp(m3)## Negative binomial model (using MASS package)
plot(simulateResiduals(m3)) ## Model results
summary(m3)##
## Call:
## glm.nb(formula = audience_n ~ gender + pc1 + pc2, data = dp,
## init.theta = 5.308640158, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5620 -0.7413 -0.2038 0.4620 3.3773
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.11234 0.10922 28.497 < 2e-16 ***
## genderM 0.27429 0.12375 2.217 0.02666 *
## pc1 0.07082 0.02460 2.879 0.00399 **
## pc2 -0.01647 0.03867 -0.426 0.67009
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(5.3086) family taken to be 1)
##
## Null deviance: 105.618 on 88 degrees of freedom
## Residual deviance: 91.338 on 85 degrees of freedom
## AIC: 709.27
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 5.309
## Std. Err.: 0.935
##
## 2 x log-likelihood: -699.267
performance::r2(mg3)## # R2 for Generalized Linear Regression
## Nagelkerke's R2: 0.010
my3 <- ggpredict(m3, terms=c("pc1","gender"))
plot(my3) +
scale_color_manual(values = c("#b2abd2", "#fdb863"))+
scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
theme_cowplot() + ggtitle("") +
ylab("Audience (N)") + xlab("Productivity (PC1 axis)")plot(my3) +
scale_color_manual(values = c("#b2abd2", "#fdb863"))+
scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
theme_cowplot() + ggtitle("") +
ylab("Audience (N)") + xlab("Productivity (PC1 axis)")my3 <- ggpredict(m3, terms=c("pc2","gender"))
plot(my3) +
scale_color_manual(values = c("#b2abd2", "#fdb863"))+
scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
theme_cowplot()